library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1     ✔ purrr   0.3.3
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   1.0.0     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(mvtnorm)
library(skimr)
## 
## Attaching package: 'skimr'
## The following object is masked from 'package:stats':
## 
##     filter
library(ggthemes)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(plotly)
## Warning: package 'plotly' was built under R version 3.6.2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:MASS':
## 
##     select
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Load the pre-split training and test sets from the project repository.
# stringsAsFactors = TRUE is set explicitly: the read.csv default changed to
# FALSE in R 4.0.0, and later steps (colouring plots by Subscription, caret's
# confusionMatrix on banktest$Subscription) need the text columns as factors.
banktrain <- read.csv(
  "https://raw.githubusercontent.com/JaclynCoate/6372_Project_2/master/Data/Training_Test_Splits/banktrain_raw.csv",
  header = TRUE,
  sep = ",",
  strip.white = TRUE,
  stringsAsFactors = TRUE
)

banktest <- read.csv(
  "https://raw.githubusercontent.com/JaclynCoate/6372_Project_2/master/Data/Training_Test_Splits/banktest_raw.csv",
  header = TRUE,
  sep = ",",
  strip.white = TRUE,
  stringsAsFactors = TRUE
)
# Per-column summary of the training split (types, missing counts, quantiles)
skim(banktrain)
## Skim summary statistics
##  n obs: 4718 
##  n variables: 22 
## 
## ── Variable type:factor ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
##      variable missing complete    n n_unique
##       contact       0     4718 4718        2
##   day_of_week       0     4718 4718        5
##       default       0     4718 4718        2
##     education       0     4718 4718        8
##       housing       0     4718 4718        3
##           job       0     4718 4718       12
##          loan       0     4718 4718        3
##       marital       0     4718 4718        4
##         month       0     4718 4718       10
##      poutcome       0     4718 4718        3
##  Subscription       0     4718 4718        2
##                                top_counts ordered
##               cel: 3391, tel: 1327, NA: 0   FALSE
##   thu: 1040, wed: 953, mon: 941, tue: 922   FALSE
##                 no: 3962, unk: 756, NA: 0   FALSE
##  uni: 1544, hig: 1079, bas: 606, pro: 569   FALSE
##      yes: 2481, no: 2129, unk: 108, NA: 0   FALSE
##   adm: 1311, blu: 893, tec: 698, ser: 406   FALSE
##       no: 3925, yes: 685, unk: 108, NA: 0   FALSE
##   mar: 2710, sin: 1485, div: 506, unk: 17   FALSE
##   may: 1282, jul: 783, aug: 699, jun: 602   FALSE
##      non: 3707, fai: 523, suc: 488, NA: 0   FALSE
##                no: 2362, yes: 2356, NA: 0   FALSE
## 
## ── Variable type:integer ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
##  variable missing complete    n     mean       sd p0   p25     p50
##       age       0     4718 4718    40.34    11.85 17    31    38  
##  campaign       0     4718 4718     2.34     2.36  1     1     2  
##  duration       0     4718 4718   385.22   356.47  4   143   266  
##     pdays       0     4718 4718   886.8    314.44  0   999   999  
##  previous       0     4718 4718     0.31     0.69  0     0     0  
##       row       0     4718 4718 24821.99 12454.11  7 14311 27457.5
##       p75  p100     hist
##     48       98 ▂▇▅▃▁▁▁▁
##      3       43 ▇▁▁▁▁▁▁▁
##    517     3631 ▇▂▁▁▁▁▁▁
##    999      999 ▁▁▁▁▁▁▁▇
##      0        6 ▇▂▁▁▁▁▁▁
##  36756.25 41187 ▂▂▃▃▂▃▃▇
## 
## ── Variable type:numeric ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
##        variable missing complete    n    mean    sd      p0     p25
##   cons_conf_idx       0     4718 4718  -40.18  5.33  -50.8   -42.7 
##  cons_price_idx       0     4718 4718   93.48  0.64   92.2    92.89
##    emp_var_rate       0     4718 4718   -0.49  1.73   -3.4    -1.8 
##       euribor3m       0     4718 4718    2.97  1.89    0.63    1.24
##     nr_employed       0     4718 4718 5136.19 86.48 4963.6  5076.2 
##      p50     p75    p100     hist
##   -41.8   -36.4   -26.9  ▁▆▆▆▇▂▂▁
##    93.44   93.99   94.77 ▂▁▇▅▁▇▁▂
##    -0.1     1.4     1.4  ▃▁▆▁▁▁▁▇
##     4.02    4.96    5.04 ▅▅▁▁▁▁▁▇
##  5191    5228.1  5228.1  ▂▂▁▂▅▁▃▇
# Row count of the training split
nrow(banktrain)
## [1] 4718
# Per-column summary of the test split (types, missing counts, quantiles)
skim(banktest)
## Skim summary statistics
##  n obs: 36470 
##  n variables: 22 
## 
## ── Variable type:factor ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
##      variable missing complete     n n_unique
##       contact       0    36470 36470        2
##   day_of_week       0    36470 36470        5
##       default       0    36470 36470        3
##     education       0    36470 36470        8
##       housing       0    36470 36470        3
##           job       0    36470 36470       12
##          loan       0    36470 36470        3
##       marital       0    36470 36470        4
##         month       0    36470 36470       10
##      poutcome       0    36470 36470        3
##  Subscription       0    36470 36470        2
##                                   top_counts ordered
##                cel: 22753, tel: 13717, NA: 0   FALSE
##   thu: 7583, mon: 7573, wed: 7181, tue: 7168   FALSE
##          no: 28626, unk: 7841, yes: 3, NA: 0   FALSE
##  uni: 10624, hig: 8436, bas: 5439, pro: 4674   FALSE
##       yes: 19095, no: 16493, unk: 882, NA: 0   FALSE
##   adm: 9111, blu: 8361, tec: 6045, ser: 3563   FALSE
##        no: 30025, yes: 5563, unk: 882, NA: 0   FALSE
##   mar: 22218, sin: 10083, div: 4106, unk: 63   FALSE
##  may: 12487, jul: 6391, aug: 5479, jun: 4716   FALSE
##       non: 31856, fai: 3729, suc: 885, NA: 0   FALSE
##                  no: 34186, yes: 2284, NA: 0   FALSE
## 
## ── Variable type:integer ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
##  variable missing complete     n     mean       sd p0     p25     p50
##       age       0    36470 36470    39.98    10.22 17   32       38  
##  campaign       0    36470 36470     2.6      2.82  1    1        2  
##  duration       0    36470 36470   241.86   239.02  0   99      172  
##     pdays       0    36470 36470   972.27   160.72  0  999      999  
##  previous       0    36470 36470     0.16     0.46  0    0        0  
##       row       0    36470 36470 20047.6  11704.32  1 9915.25 19923.5
##       p75  p100     hist
##     47       98 ▂▇▆▃▁▁▁▁
##      3       56 ▇▁▁▁▁▁▁▁
##    300     4918 ▇▁▁▁▁▁▁▁
##    999      999 ▁▁▁▁▁▁▁▇
##      0        7 ▇▁▁▁▁▁▁▁
##  30058.75 41188 ▇▇▇▇▇▇▇▆
## 
## ── Variable type:numeric ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
##        variable missing complete     n    mean    sd      p0     p25
##   cons_conf_idx       0    36470 36470  -40.54  4.53  -50.8   -42.7 
##  cons_price_idx       0    36470 36470   93.59  0.57   92.2    93.08
##    emp_var_rate       0    36470 36470    0.16  1.53   -3.4    -1.8 
##       euribor3m       0    36470 36470    3.71  1.7     0.63    1.41
##     nr_employed       0    36470 36470 5171.03 69.2  4963.6  5099.1 
##      p50     p75    p100     hist
##   -41.8   -36.4   -26.9  ▁▅▆▃▇▁▁▁
##    93.92   93.99   94.77 ▁▁▅▅▁▇▁▂
##     1.1     1.4     1.4  ▁▁▃▁▁▁▁▇
##     4.86    4.96    5.04 ▁▃▁▁▁▁▁▇
##  5191    5228.1  5228.1  ▁▁▁▁▃▁▃▇
# Row count of the test split
nrow(banktest)
## [1] 36470

Having completed our initial EDA, we now continue the exploration with the goal of fitting an LDA model.

# Drop every column that the discriminant models below do not use.
# Pass 1 removes the categorical predictors together with age and duration;
# pass 2 removes the remaining unused numeric columns and the `row` column.
banktrain2 <- banktrain %>%
  dplyr::select(
    -age, -duration, -job, -marital, -education, -default,
    -housing, -loan, -contact, -month, -day_of_week, -poutcome
  )
banktrain3 <- banktrain2 %>%
  dplyr::select(-pdays, -nr_employed, -emp_var_rate, -row, -previous, -campaign)
# Summarise what is left: the response plus the retained numeric predictors
skim(banktrain3)
## Skim summary statistics
##  n obs: 4718 
##  n variables: 4 
## 
## ── Variable type:factor ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
##      variable missing complete    n n_unique                 top_counts
##  Subscription       0     4718 4718        2 no: 2362, yes: 2356, NA: 0
##  ordered
##    FALSE
## 
## ── Variable type:numeric ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
##        variable missing complete    n   mean   sd     p0    p25    p50
##   cons_conf_idx       0     4718 4718 -40.18 5.33 -50.8  -42.7  -41.8 
##  cons_price_idx       0     4718 4718  93.48 0.64  92.2   92.89  93.44
##       euribor3m       0     4718 4718   2.97 1.89   0.63   1.24   4.02
##     p75   p100     hist
##  -36.4  -26.9  ▁▆▆▆▇▂▂▁
##   93.99  94.77 ▂▁▇▅▁▇▁▂
##    4.96   5.04 ▅▅▁▁▁▁▁▇
# Open the reduced training frame in the data viewer for manual inspection;
# invisible() keeps the value returned by view() from being printed.
# NOTE(review): view() is presumably tibble's (attached via tidyverse) — confirm.
invisible(view(banktrain3))

Pairs Scatter Plot

# Scatter-plot matrix of every column in banktrain3 (Subscription included),
# with points coloured by the Subscription factor.
pairs(banktrain3, col = banktrain3$Subscription)

LDA

# Fit a linear discriminant model for Subscription on the three retained
# numeric predictors, using the training split.
mylda <- lda(
  Subscription ~ cons_price_idx + cons_conf_idx + euribor3m,
  data = banktrain
)
# predict() returns several components; `class` holds the predicted level of
# the response for each test row.
Pred <- predict(mylda, newdata = banktest)[["class"]]
# Hand-built confusion matrix: predicted level (rows) against observed (columns)
Truth <- banktest[["Subscription"]]
x <- table(Pred, Truth)
x
##      Truth
## Pred     no   yes
##   no  24665   666
##   yes  9521  1618
# caret's confusion-matrix report for the LDA predictions on the test split.
# Note: the printed report uses 'no' as the positive class, so Sensitivity
# refers to non-subscribers and Specificity to subscribers.
CMlda <- confusionMatrix(data = Pred, reference = Truth)
CMlda
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    no   yes
##        no  24665   666
##        yes  9521  1618
##                                          
##                Accuracy : 0.7207         
##                  95% CI : (0.716, 0.7253)
##     No Information Rate : 0.9374         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.153          
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.7215         
##             Specificity : 0.7084         
##          Pos Pred Value : 0.9737         
##          Neg Pred Value : 0.1453         
##              Prevalence : 0.9374         
##          Detection Rate : 0.6763         
##    Detection Prevalence : 0.6946         
##       Balanced Accuracy : 0.7150         
##                                          
##        'Positive' Class : no             
## 

QDA

# Fit a quadratic discriminant model for Subscription on the same three
# numeric predictors, using the training split.
myqda <- qda(
  Subscription ~ cons_price_idx + cons_conf_idx + euribor3m,
  data = banktrain
)
# predict() returns several components; `class` holds the predicted level of
# the response for each test row.
PredQ <- predict(myqda, newdata = banktest)[["class"]]
# Hand-built confusion matrix: predicted level (rows) against observed (columns)
TruthQ <- banktest[["Subscription"]]
x <- table(PredQ, TruthQ)
x
##      TruthQ
## PredQ    no   yes
##   no  31607  1204
##   yes  2579  1080
# caret's confusion-matrix report for the QDA predictions on the test split.
# Note: the printed report uses 'no' as the positive class, so Sensitivity
# refers to non-subscribers and Specificity to subscribers.
CMqda <- confusionMatrix(data = PredQ, reference = TruthQ)
CMqda
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    no   yes
##        no  31607  1204
##        yes  2579  1080
##                                           
##                Accuracy : 0.8963          
##                  95% CI : (0.8931, 0.8994)
##     No Information Rate : 0.9374          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.3103          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9246          
##             Specificity : 0.4729          
##          Pos Pred Value : 0.9633          
##          Neg Pred Value : 0.2952          
##              Prevalence : 0.9374          
##          Detection Rate : 0.8667          
##    Detection Prevalence : 0.8997          
##       Balanced Accuracy : 0.6987          
##                                           
##        'Positive' Class : no              
## 

3D Graph of Continuous Variables (predictors) colored by Subscription (dependent variable)

# Interactive 3D scatter of the three model predictors from the training
# split, one marker per row, coloured by Subscription.
fig <- plot_ly(
  data = banktrain,
  x = ~cons_price_idx,
  y = ~cons_conf_idx,
  z = ~euribor3m,
  color = ~Subscription,
  colors = c("#BF382A", "#228B22")
)
fig2 <- add_markers(fig)
# Axis titles for the 3D scene; plotly::layout is named explicitly because
# graphics::layout is masked by it.
fig3 <- plotly::layout(
  fig2,
  scene = list(
    xaxis = list(title = "Consumer Price Indx"),
    yaxis = list(title = "Consumer Confidence Indx"),
    zaxis = list(title = "3 Month Rate")
  )
)
fig3